import sys

import requests
from bs4 import BeautifulSoup
from docx import Document
# Target URL of the page to scrape.
url = 'https://fgk.chinatax.gov.cn/zcfgk/c102416/c5239243/content.html'

# Fetch the page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() prevents an HTTP error page
# (4xx/5xx) from being parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'  # force UTF-8 decoding to avoid garbled text

# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the <div class="arc_cont"> element that is expected to hold the
# article body.
container_div = soup.find('div', class_='arc_cont')

# Extract the text of that element, one stripped text fragment per line.
if container_div is not None:
    article_content = container_div.get_text(strip=True, separator='\n')
    print(article_content)
else:
    # Report the miss on stderr so that stdout stays reserved for the article
    # text and the failure is not silent.
    print(f'No <div class="arc_cont"> element found at {url}', file=sys.stderr)

     
# r = requests.get(d1["path"],headers=headers)
# print(r.text)
    
# for d in datalist:
#     r = requests.get(d["path"],headers=headers)
#     print(r.text)
    
#https://fgk.chinatax.gov.cn/zcfgk/c102416/c5239243/content.html
#http://www.chinatax.gov.cn/zcfgk/c102416/c5239243/content.html
#urljoin('https://www.baidu.com', 'FAQ.html')